raw.dta<-read_csv("../data/FSA_data_competition_2020.csv", na = c("NA", ":", "unclassified", "unknown")) %>%
rename("ID" = "ID",
"date_added" = "Date Added",
"date_published" = "Date of Publishing",
"data_source" = "Data Source",
"source_type" = "Source Type",
"alert_type" = "Alert Type",
"raw_text_product" = "Raw Product Phrase",
"product_categoty" = "Product Category",
"product" = "Commodity / Product",
"origin_country" = "Country of Origin",
"origin_country_EU" = "Eu/non-EU Country of Origin",
"notified_country" = "Notified by",
"notified_country_EU" = "EU/non-EU Notifying Country",
"incident_title" = "Incident Title",
"hazard_description" = "Hazard Description", # can extract about ecoli fro here
"hazard_group" = "Hazard Group",
"summary" = "Summary",
"link" = "Link",
"food_feed_fcm" = "Food; Feed or FCM",
"manufacturer" = "Manufacturer",
"brand" = "Brand",
"organisation" = "Organisations",
"food_or_not" = "Is A Food Article" )
# basic tidy
data <- raw.dta %>%
select(-food_or_not, -incident_title) %>%
mutate(food_feed_fcm = ifelse(food_feed_fcm == 'FCM', 'fcm',
ifelse(food_feed_fcm =='Food', 'food', food_feed_fcm))) %>%
filter(food_feed_fcm != "fcm")
data %>% arrange(date_published) %>% vis_miss()

# new cols
data <- data %>%
# tidy up dates using lubridate
mutate(date_added = dmy(date_added),
date_published = dmy(date_published)) %>%
mutate(date_added_year = year(date_added),
date_published_year = year(date_published)) %>%
mutate(date_published_month = ifelse(nchar(month(date_published)) == 2, month(date_published), paste0(0, month(date_published))),
# create year_month
date_published_year_month = paste0(date_published_year, "-", date_published_month),
#create year_quarter
date_published_quarter = ifelse(date_published_month %in% c("01", "02", "03"), "Q1",
ifelse(date_published_month %in% c("04", "05", "06"), "Q2",
ifelse(date_published_month %in% c("07", "08", "09"), "Q3",
ifelse(date_published_month %in% c("10", "11", "12"), "Q4", NA)))),
date_published_year_quarter = paste0(date_published_year, "-", date_published_quarter),
date_published_fulltime = paste0(date_published, " 00:00:00 +00:00")) %>%
# create incident ID
separate(ID, into= c("ID", "ID_incident"), sep= "-", remove=F) %>%
mutate(ID_incident = ifelse(is.na(ID_incident), ID, ID_incident))
# 2015-01-15 19:05:39 +00:00
data %>% count(date_added_year)
## # A tibble: 2 x 2
## date_added_year n
## <dbl> <int>
## 1 2019 21876
## 2 2020 10244
data %>% count(date_published_year)
## # A tibble: 6 x 2
## date_published_year n
## <dbl> <int>
## 1 2016 4306
## 2 2017 6497
## 3 2018 8228
## 4 2019 10149
## 5 2020 2937
## 6 NA 3
data %>% count(product) %>% arrange(-n)
## # A tibble: 3,490 x 2
## product n
## <chr> <int>
## 1 <NA> 1973
## 2 chicken 1951
## 3 bakery product 1935
## 4 beef 1681
## 5 meat product 995
## 6 pepper 816
## 7 food supplement 720
## 8 cheese 660
## 9 pork 644
## 10 sesame 540
## # … with 3,480 more rows
data %>% count(alert_type) %>% arrange(-n)
## # A tibble: 10 x 2
## alert_type n
## <chr> <int>
## 1 recall 14064
## 2 border rejection 5504
## 3 alert 3649
## 4 information for attention 2692
## 5 update 2082
## 6 information for follow-up 1941
## 7 outbreak 1863
## 8 warning 182
## 9 information 109
## 10 lookout 34
data %>% count(data_source) %>% arrange(-n)
## # A tibble: 43 x 2
## data_source n
## <chr> <int>
## 1 RASFF Portal 11906
## 2 CDPH Recalls (Canada) 5075
## 3 Food Poisoning Bulletin (US) 2133
## 4 Ministry of Health - Border Rejections (Japan) 1850
## 5 FoodSafetyNews.com 1495
## 6 FSA Alerts & Recalls (UK) 1427
## 7 MAPAQ (Canada) 1230
## 8 FDA Recalls (USA) 904
## 9 Product Recalls Website: Oulah (France) 896
## 10 AFSCA Recalls (Belgium) 614
## # … with 33 more rows
data %>% count(hazard_group) %>% arrange(-n)
## # A tibble: 35 x 2
## hazard_group n
## <chr> <int>
## 1 microbial contaminants (other) 5831
## 2 pathogenic micro-organisms 5588
## 3 allergens 5221
## 4 <NA> 3171
## 5 foreign bodies 1896
## 6 composition 1536
## 7 poor or insufficient controls 1192
## 8 pesticide residues 1149
## 9 heavy metals 869
## 10 Fraud 783
## # … with 25 more rows
data %>% count(origin_country) %>% arrange(-n)
## # A tibble: 151 x 2
## origin_country n
## <chr> <int>
## 1 Canada 6508
## 2 USA 4763
## 3 United Kingdom 1966
## 4 France 1823
## 5 Italy 1130
## 6 China 1040
## 7 Belgium 1018
## 8 Poland 885
## 9 Netherlands 840
## 10 Turkey 826
## # … with 141 more rows
ggplot(data, aes(x = data_source, fill = alert_type)) +
geom_bar()+
theme_minimal_hgrid(10, rel_small = 1)+
#facet_grid(~alert_type)+
#scale_fill_manual(values=pal)+
coord_flip()+
labs(fill = "title", y="") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Hazard categoty overview by Year
dat1<-data %>%
filter(!is.na(hazard_group),
date_published_year_month != "NA-NA") %>%
group_by(hazard_group) %>%
filter(n()>300) %>%
ungroup %>%
select(hazard_group, date_published_year ) %>%
group_by(hazard_group, date_published_year) %>%
mutate(count = n())%>%
distinct()
pc <-ggplot(data = dat1, aes(x = date_published_year , y = count, group = hazard_group)) +
geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
#scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)] )+
theme(legend.position = "right", ncol=1) +
#scale_colour_manual(values=c(pal))+
theme_minimal_hgrid(10, rel_small = 1) +
labs(x = "year", colour="hazard group",
y = "counts",
title = "")+
guides(alpha = FALSE)
pc

# Hazard categoty overview by Month (all records)
dat2<-data %>%
filter(!is.na(hazard_group),
date_published_year_month != "NA-NA") %>%
group_by(hazard_group) %>%
filter(n()>500) %>%
ungroup %>%
select(hazard_group, date_published_year_month ) %>%
group_by(hazard_group, date_published_year_month) %>%
mutate(count = n())%>%
distinct()
pc <-ggplot(data = dat2, aes(x = date_published_year_month , y = count, group = hazard_group)) +
geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
#scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)] )+
theme(legend.position = "right", ncol=1) +
#scale_colour_manual(values=c(pal))+
theme_minimal_hgrid(9, rel_small = 1) +
labs(x = "year", colour="hazard group",
y = "counts",
title = "")+
guides(alpha = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc

# Hazard categoty overview by Month (report multiple issue related to one event as one event)
dat3<-data %>%
filter(!is.na(hazard_group),
date_published_year_month != "NA-NA") %>%
group_by(hazard_group) %>%
filter(n()>500) %>%
ungroup %>%
select(hazard_group, date_published_year_month, ID_incident) %>% distinct() %>%
group_by(hazard_group, date_published_year_month) %>%
mutate(count = n())%>%
distinct()
pc3 <-ggplot(data = dat3, aes(x = date_published_year_month , y = count, group = hazard_group)) +
geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
#scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)] )+
theme(legend.position = "right", ncol=1) +
#scale_colour_manual(values=c(pal))+
theme_minimal_hgrid(9, rel_small = 1) +
labs(x = "year", colour="hazard group",
y = "counts",
title = "")+
guides(alpha = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc3

# Main 3 categories, with smooth lines
dat4<-data %>%
filter(!is.na(hazard_group),
date_published_year_month != "NA-NA") %>%
group_by(hazard_group) %>%
#filter(n()>500) %>%
ungroup %>%
select(hazard_group, date_published_year_month, ID_incident) %>% distinct() %>%
group_by(hazard_group, date_published_year_month) %>%
filter(hazard_group %in% c("microbial contaminants (other)", "pathogenic micro-organisms", "allergens")) %>%
mutate(count = n())%>%
distinct()
pc4 <-ggplot(data = dat4, aes(x = date_published_year_month , y = count, group = hazard_group)) +
#geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
geom_smooth(aes(x = date_published_year_month , y = count, color = hazard_group))+
#scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)] )+
theme(legend.position = "right", ncol=1) +
#scale_colour_manual(values=c(pal))+
theme_minimal_hgrid(9, rel_small = 1) +
labs(x = "year", colour="hazard group",
y = "counts",
title = "")+
guides(alpha = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc4

# + other categories smooth line
dat5<-data %>%
filter(!is.na(hazard_group),
date_published_year_month != "NA-NA") %>%
group_by(hazard_group) %>%
filter(n()>500) %>%
ungroup %>%
select(hazard_group, date_published_year_month, ID_incident) %>% distinct() %>%
group_by(hazard_group, date_published_year_month) %>%
#filter(hazard_group %in% c("microbial contaminants (other)", "pathogenic micro-organisms", "allergens")) %>%
mutate(count = n())%>%
distinct()
pc5 <-ggplot(data = dat5, aes(x = date_published_year_month , y = count, group = hazard_group)) +
#geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
geom_smooth(aes(x = date_published_year_month , y = count, color = hazard_group))+
#scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)] )+
theme(legend.position = "right", ncol=1) +
#scale_colour_manual(values=c(pal))+
theme_minimal_hgrid(9, rel_small = 1) +
labs(x = "year", colour="hazard group",
y = "counts",
title = "")+
guides(alpha = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc5

# all foreign bodies
dat7<-data %>%
filter(!is.na(hazard_group),
date_published_year_month != "NA-NA") %>%
group_by(hazard_group) %>%
filter(n()>500) %>%
ungroup %>%
select(hazard_group, date_published_year_month, ID_incident) %>% distinct() %>%
group_by(hazard_group, date_published_year_month) %>%
filter(hazard_group == "foreign bodies") %>%
mutate(count = n())%>%
distinct()
pc7 <-ggplot(data = dat7, aes(x = date_published_year_month , y = count, group = hazard_group)) +
#geom_line(aes(color = hazard_group, alpha = 1), size = 1) +
geom_point(aes(color = hazard_group, alpha = 1), size = 3) +
geom_smooth(aes(x = date_published_year_month , y = count, color = hazard_group))+
#scale_x_continuous(breaks = sort(unique(dat$date_published_year))[c(TRUE, FALSE)] )+
theme(legend.position = "right", ncol=1) +
#scale_colour_manual(values=c(pal))+
theme_minimal_hgrid(9, rel_small = 1) +
labs(x = "year", colour="hazard group",
y = "counts",
title = "")+
guides(alpha = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pc7

Add coordinates data
add_coordinates <- function(input){
## get coordinates
# get world map
wmap <- getMap(resolution="high")
# get centroids
centroids <- gCentroid(wmap, byid=TRUE)
# get a data.frame with centroids
geo_df <- as.data.frame(centroids)
colnames(geo_df) <- c("long", "lat")
geo_df <- geo_df %>% tibble::rownames_to_column("country")
# update names in data
input<- input %>%
mutate( origin_country = case_when(origin_country == "USA" ~ "United States of America",
origin_country =="Hong Kong" ~ "Hong Kong S.A.R.",
origin_country =="Serbia" ~ "Republic of Serbia",
origin_country =="Bosnia Herzegovina" ~ "Bosnia and Herzegovina",
origin_country =="Tanzania" ~ "United Republic of Tanzania",
TRUE ~ origin_country)) %>%
mutate( notified_country = case_when(notified_country == "USA" ~ "United States of America",
notified_country =="Hong Kong" ~ "Hong Kong S.A.R.",
notified_country =="Serbia" ~ "Republic of Serbia",
notified_country =="Bosnia Herzegovina" ~ "Bosnia and Herzegovina",
notified_country =="Tanzania" ~ "United Republic of Tanzania",
TRUE ~ notified_country)) %>%
filter(!origin_country %in% c("Palestinian Territories", "INFOSAN" , "Commission Services", NA) | !notified_country %in% c("Palestinian Territories", "INFOSAN" , "Commission Services", NA))
# join with coords data
output <- input %>%
left_join(., geo_df, by = c("origin_country" = "country")) %>%
rename("lat.origin" = "lat",
"long.origin" = "long") %>%
left_join(., geo_df, by = c("notified_country" = "country")) %>%
rename("lat.notified" = "lat",
"long.notified" = "long") %>% drop_na()
return(output)
}
test <- add_coordinates(data)
save all with coords
data %>%
select(-link, -brand, -manufacturer, -organisation, -date_added, -date_added_year) %>%
add_coordinates() %>%
write_csv("../data/tidy/food_hazards_data_all.csv")
Add random coordinates
library(sp)
library(maptools)
data(wrld_simpl) # load data
data_all<-read_csv("../data/tidy/food_hazards_data_all.csv")
# update names in data to match source of coordiantes
input<- data_all %>%
mutate( origin_country = case_when(origin_country == "United States of America" ~ "United States",
origin_country == "Hong Kong S.A.R." ~ "Hong Kong",
origin_country == "Republic of Serbia" ~ "Serbia",
origin_country =="Vietnam" ~ "Viet Nam",
origin_country =="South Korea" ~ "Korea, Republic of",
origin_country =="Iran" ~ "Iran (Islamic Republic of)",
origin_country =="Syria" ~ "Syrian Arab Republic",
origin_country =="Laos" ~ "Lao People's Democratic Republic",
origin_country =="Moldova" ~ "Republic of Moldova",
origin_country =="Brunei" ~ "Brunei Darussalam",
origin_country =="Kosovo" ~ "Serbia",
origin_country =="Myanmar" ~ "Burma",
TRUE ~ origin_country)) %>%
mutate( notified_country = case_when(notified_country == "United States of America" ~ "United States",
notified_country == "Hong Kong S.A.R." ~ "Hong Kong",
notified_country == "Republic of Serbia" ~ "Serbia",
notified_country =="Vietnam" ~ "Viet Nam",
notified_country =="South Korea" ~ "Korea, Republic of",
notified_country =="Iran" ~ "Iran (Islamic Republic of)",
notified_country =="Syria" ~ "Syrian Arab Republic",
notified_country =="Laos" ~ "Lao People's Democratic Republic",
notified_country =="Moldova" ~ "Republic of Moldova",
notified_country =="Brunei" ~ "Brunei Darussalam",
notified_country =="Kosovo" ~ "Serbia",
notified_country =="Myanmar" ~ "Burma",
TRUE ~ notified_country))
get_random_cooor_single <- function(country_name){
if (!country_name %in% wrld_simpl$NAME){
stop(paste0("Polygon not available for ", country_name))
}
# get country polygon
sw <- slot(wrld_simpl[wrld_simpl$NAME == country_name,], "polygons")[[1]]
# sample a random coord (n) within that polygon
rpoints <- sp::spsample(sw, n = 1, type = "random") %>% as.data.frame() # long-lat
# convert to vector
rpoints_vec <- paste(as.numeric(rpoints[1,]), collapse = ",")
return(rpoints_vec)
}
get_random_cooor <- function(country_name, n){
if (!country_name %in% wrld_simpl$NAME){
stop(paste0("Polygon not available for ", country_name))
}
# get country polygon
sw <- slot(wrld_simpl[wrld_simpl$NAME == country_name,], "polygons")[[1]]
# sample a random coord (n) within that polygon
rpoints <- sp::spsample(sw, n = n, type = "random") %>% as.data.frame() # long-lat
colnames(rpoints) <- c("long", "lat")
return(rpoints)
}
## Create random coords for Origin countries
subset_origin <-input %>% select(ID, origin_country)
origin_coords <- data.frame()
for (country in unique(subset_origin$origin_country) ){
# subset to country
country_subset<-subset_origin %>% filter(origin_country == country)
# generate rand coords for n cases
tmp<-get_random_cooor(country, n = dim(country_subset)[1])
# addign them to case IDs (order does not matter)
tmp2<-cbind(country_subset, tmp)
# merge into df that will hold all coords for origin data points
origin_coords <- rbind(origin_coords, tmp2)
}
colnames(origin_coords)[3:4] <- c('long.origin.rand', 'lat.origin.rand')
## Create random coords for Notified countries
subset_notified <-input %>% select(ID, notified_country)
notified_coords <- data.frame()
for (country in unique(subset_notified$notified_country) ){
# subset to country
country_subset<-subset_notified %>% filter(notified_country == country)
# generate rand coords for n cases
tmp<-get_random_cooor(country, n = dim(country_subset)[1])
# addign them to case IDs (order does not matter)
tmp2<-cbind(country_subset, tmp)
# merge into df that will hold all coords for notified data points
notified_coords <- rbind(notified_coords, tmp2)
}
colnames(notified_coords)[3:4] <- c('long.notified.rand', 'lat.notified.rand')
## merge new rand coords by case ID inro one df
coords_rand <- full_join(origin_coords, notified_coords, by="ID") %>%
select(-origin_country, -notified_country)
## add new random coords to the main dataset
data_all_upd <- full_join(data_all, coords_rand, by ="ID")
write_csv(data_all_upd, "../data/tidy/food_hazards_data_all.csv")